Minería de datos - Proyecto Final
Librerías
library(tidyverse)
library(rmdformats)
library(stats)
library(cluster)
library(mclust)
library(factoextra)
library(dendextend)
library(DT)
library(purrr)
library(igraph)
library(tidygraph)
library(ggraph)
library(ggpubr)
library(clustertend)
library(fpc)
library(FactoMineR)
library(factoextra)
library(pvclust)
library(cluster.datasets)
library(mltools)
library(data.table)
Datos
# Load Heart.csv from a machine-specific absolute path.
file <- "/Users/antony.vargasulead.ac.cr/Mineria de datos/Proyecto Final/Data/Heart.csv"
df <- read.csv(file, sep = ",", dec = ".")
# Remove the `target` column before the unsupervised analyses.
# drop = FALSE keeps `df` a data frame even if only one column were left.
df <- df[, !(names(df) %in% "target"), drop = FALSE]
DT::datatable(df)
2. Análisis descriptivo
3. Análisis no Supervisado:
3.1 Análisis de componentes principales
# Fit the PCA keeping one component per variable.
# Every argument is named: with `graph` named, a bare third argument is matched
# positionally to `scale.unit` (the second formal of FactoMineR::PCA) instead
# of `ncp`, so the number of retained components silently stayed at its default.
PCA <- PCA(df, scale.unit = TRUE, ncp = ncol(df), graph = FALSE)
PCA
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 1025 individuals, described by 13 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "summary statistics"
## 12 "$call$centre" "mean of the variables"
## 13 "$call$ecart.type" "standard error of the variables"
## 14 "$call$row.w" "weights for the individuals"
## 15 "$call$col.w" "weights for the variables"
3.1.1 Tabla general
# Eigenvalue table. Columns 2 and 3 hold percentages; they are rescaled to
# fractions because DT::formatPercentage() renders fractions as percentages.
eig.tmp <- PCA$eig
eig.tmp[, c(2, 3)] <- eig.tmp[, c(2, 3)] / 100
DT::formatPercentage(
  DT::formatRound(DT::datatable(eig.tmp), columns = "eigenvalue", digits = 2),
  columns = c("percentage of variance", "cumulative percentage of variance"),
  digits = 2
)
3.1.2 Gráfico de sedimentación
# Line plot of the cumulative percentage of variance (third column of
# PCA$eig) against the component index.
# seq_len(nrow(...)) replaces 1:dim(...)[1], which would yield c(1, 0) for an
# empty eigenvalue table.
ggplot(
  data = data.frame(
    prop_varianza_acum = PCA$eig[, 3],
    pc = seq_len(nrow(PCA$eig))
  ),
  aes(x = pc, y = prop_varianza_acum, group = 1)
) +
  geom_point() +
  geom_line() +
  theme_bw() +
  labs(
    x = "Componente principal",
    y = "Prop. varianza explicada acumulada"
  )
3.1.3 Tabla de cosenos cuadrados - individuos
# Squared cosines of the individuals, shown as percentages.
DT::datatable(PCA$ind$cos2) %>%
  DT::formatPercentage(columns = colnames(PCA$ind$cos2), digits = 2)
3.1.4 Tabla de contribuciones - individuos
# Contributions of the individuals, rounded to three decimals.
DT::datatable(PCA$ind$contrib) %>%
  DT::formatRound(columns = colnames(PCA$ind$contrib), digits = 3)
3.1.5 Tabla de cosenos cuadrados - variables
# Squared cosines of the variables, shown as percentages.
DT::datatable(PCA$var$cos2) %>%
  DT::formatPercentage(columns = colnames(PCA$var$cos2), digits = 2)
3.1.6 Tabla de contribuciones - variables
# Contributions of the variables, rounded to three decimals.
DT::datatable(PCA$var$contrib) %>%
  DT::formatRound(columns = colnames(PCA$var$contrib), digits = 3)
3.1.7 Plano principal - Cosenos cuadrados de individuos
# Individuals on the principal plane, coloured by squared cosine.
fviz_pca_ind(
  PCA,
  col.ind = "cos2",
  geom = "point",
  gradient.cols = c("black", "#2E9FDF", "#FC4E07"),
  title = "Cosenos cuadrados - individuos"
)
3.1.8 Plano principal - Contribución de individuos
# Individuals on the principal plane, coloured by contribution.
# The title replaces a leftover template string ("Ejemplo 1 Contribución") so
# it follows the same pattern as the cos2 plot above.
fviz_pca_ind(
  PCA,
  col.ind = "contrib",
  geom = "point",
  gradient.cols = c("black", "#2E9FDF", "#FC4E07"),
  title = "Contribución - individuos",
  repel = TRUE
)
3.1.9 Círculo de correlación - Cosenos cuadrados de variables
# Variable plot coloured by squared cosine.
fviz_pca_var(
  PCA,
  col.var = "cos2",
  gradient.cols = c("black", "blue", "red"),
  ggtheme = theme_minimal()
)
3.1.10 Círculo de correlación - Contribución de variables
# Variable plot coloured by contribution.
fviz_pca_var(
  PCA,
  col.var = "contrib",
  gradient.cols = c("black", "blue", "red"),
  ggtheme = theme_minimal()
)
### 3.1.11 Correlación entre variables originales y los componentes principales
library(corrplot)
## corrplot 0.90 loaded
# Plot the matrix of correlations between the variables and the dimensions.
corrplot(corr = PCA$var$cor)
3.2 Análisis de correspondencia simple
3.2.1 Aplicación del Análisis de correspondencia simple (ACS)
# Simple correspondence analysis on `df`, keeping as many dimensions as `df`
# has columns; the built-in graph is drawn as well.
ACS <- CA(df, ncp = ncol(df), graph = TRUE)
3.2.2 Valores propios - inercia explicada
# Eigenvalue plot of the correspondence analysis.
fviz_eig(
  ACS,
  barfill = "#00AFBB",
  barcolor = "#00AFBB",
  linecolor = "#FC4E07"
)
3.2.3 Plano principal - Cosenos cuadrados de individuos
# Row points coloured by squared cosine; the selection is restricted with a
# cos2 threshold of 0.80 and labels are repelled to reduce overlap.
fviz_ca_row(
  ACS,
  col.row = "cos2",
  select.row = list(cos2 = 0.80),
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = TRUE
)
## Warning: ggrepel: 119 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
3.2.4 Plano principal - Cosenos cuadrados de variables
# Column points coloured by squared cosine.
fviz_ca_col(
  ACS,
  col.col = "cos2",
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07")
)
3.2.5 Gráfico de sobreposición
# Biplot of rows and columns; rows are restricted with a cos2 threshold of 0.80.
fviz_ca_biplot(
  ACS,
  select.row = list(cos2 = 0.80),
  repel = TRUE
)
## Warning: ggrepel: 150 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
3.4 Sentido de la clusterización en el problema
# Centre and scale every column before computing distances.
df_scale <- scale(df, center = TRUE, scale = TRUE)
# k-means starts from random centres: fix the seed and use several random
# starts so the partition is reproducible between runs.
set.seed(123)
km.datos <- kmeans(x = df_scale, centers = 3, nstart = 25)
# Cluster plot of the k-means partition. The title names the Heart data
# (it previously read "Datos iris").
p1 <- fviz_cluster(object = km.datos, data = df_scale,
                   ellipse.type = "norm", geom = "point", main = "Datos Heart",
                   stand = FALSE, palette = "jco") +
  theme_bw() + theme(legend.position = "none")
p1
# Dendrogram of a hierarchical clustering on the same scaled data, cut into
# three groups. This assignment was fused with the preceding `p1` print as
# `p1p2 <- ...`, which left `p2` undefined.
p2 <- fviz_dend(x = hclust(dist(df_scale)), k = 3, k_colors = "jco",
                show_labels = FALSE, main = "Datos Heart")
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
p2